import pandas as pd

if __name__ == '__main__':
    # Script entry point: load the step-1 bank CSV, drop the rows whose
    # 'unknown' marker is rare in a given column, report the columns where
    # 'unknown' is common, then show an unrelated random scatter plot.
    input_data_path = "C:\\Users\\26818\\Desktop\\datamining\\bank\\bank-full_step1.csv"
    # NOTE(review): nothing in this script writes to this path yet; presumably
    # it is the intended destination of the cleaned frame -- confirm.
    processed_data_path = 'C:\\Users\\26818\\Desktop\\datamining\\bank\\bank-full_step2.csv'

    print("Loading data...")
    data = pd.read_csv(input_data_path)
    data.info()
    print("Preprocessing data...")
    # NOTE(review): numeric_attrs is not used below; kept for later steps.
    numeric_attrs = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', ]
    bin_attrs = ['default', 'housing', 'loan']
    cate_attrs = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

    # Columns with fewer 'unknown' rows than this have those rows dropped;
    # the remaining columns are collected in fill_attrs instead.
    unknown_row_threshold = 500
    fill_attrs = []
    for column in bin_attrs + cate_attrs:
        is_unknown = data[column] == 'unknown'
        # Series.count() skips missing values, so this counts the 'unknown'
        # rows that have a non-null 'y'.
        if data.loc[is_unknown, 'y'].count() < unknown_row_threshold:
            # Drop the ROWS (not the column) holding 'unknown' in this column.
            data = data[~is_unknown]
        else:
            fill_attrs.append(column)

    for column in fill_attrs:
        print()
        print(data[column].value_counts())
    print(len(fill_attrs))

    import numpy as np
    import matplotlib.pyplot as plt

    # Scatter-plot demo on random samples. Dedicated names are used so the
    # preprocessed DataFrame in `data` is not overwritten by the demo arrays.
    x_values = np.random.randn(100)
    y_values = np.random.randn(100)
    print(y_values)
    fig, ax = plt.subplots(1, 1)
    ax.scatter(x_values, y_values)
    plt.show()
